# Read the daily temperature observations prepared earlier in the pipeline,
# convert area/source/year to factors, count data sources per area (VN has
# only 2 of the 3 sources — presumably no logger data), and build d2 with
# pre-warming BT (year <= 1980) and SI_HA (year <= 1972) data discarded.
# NOTE(review): d2 is created here but the sdmTMB model below is fit to `d`,
# not `d2` — confirm which data set the model is meant to use.
d <-read_csv(paste0(home, "/output/temp_data_for_fitting.csv"))#> Rows: 98616 Columns: 13#> ── Column specification ────────────────────────────────────────────────────────#> Delimiter: ","#> chr (4): area, source, db, id#> dbl (8): year, temp, yday, month, day, VkDag, stn_nr, section_nr#> date (1): date#> #> ℹ Use `spec()` to retrieve the full column specification for this data.#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.d <- d |>mutate(area =as.factor(area),source_f =as.factor(source),year_f =as.factor(year))#> mutate: converted 'area' from character to factor (0 new NA)#> new variable 'source_f' (factor) with 3 unique values and 0% NA#> new variable 'year_f' (factor) with 83 unique values and 0% NA# Drop VN, no logger data?d |>group_by(area) |>summarise(n_source =length(unique(source_f))) #> group_by: one grouping variable (area)#> summarise: now 12 rows and 2 columns, ungrouped#> # A tibble: 12 × 2#> area n_source#> <fct> <int>#> 1 BS 3#> 2 BT 3#> 3 FB 3#> 4 FM 3#> 5 HO 3#> 6 JM 3#> 7 MU 3#> 8 RA 3#> 9 SI_EK 3#> 10 SI_HA 3#> 11 TH 3#> 12 VN 2# Drop data in SI_HA and BT before onset of warming?d2 <- d |>mutate(discard ="N",discard =ifelse(area =="BT"& year <=1980, "Y", discard),discard =ifelse(area =="SI_HA"& year <=1972, "Y", discard)) |>filter(discard =="N")#> mutate: new variable 'discard' (character) with 2 unique values and 0% NA#> filter: removed 1,146 rows (1%), 97,470 rows remaining
Fit models
Source as independent or interactive effect
Code
# Abandoned alternatives kept for reference: a brms Student-t GAM (noted as
# too slow) and a QRM Student-t fit. Everything in this chunk is commented out
# and has no effect on the analysis.
# library(brms)# # Too slow...# testm <- brm(temp ~ area*year_f + source_f + s(yday, by = area, bs = "cc"), # data = d |> sample_n(5000),# family = student(),# chains = 2,# cores = 2,# knots = list(yday = c(0.5, 364.5))# )# # summary(testm)# # detach("package:brms", unload = TRUE)# # library(QRM)# # fit.st(data = d |> sample_n(500))
Code
# Fit a Student-t (df = 5) model in sdmTMB: area-by-year interaction, an
# additive data-source effect, and an area-specific cyclic smooth over day of
# year (knots at 0.5 and 364.5 so Dec 31 wraps smoothly to Jan 1). Spatial
# and spatiotemporal fields are off, so this is a fast non-spatial GAM-style
# fit; one extra Newton loop tightens convergence of the fixed effects.
# NOTE(review): the model is fit to `d`, not the filtered `d2` created in the
# chunk above — confirm whether the pre-warming BT/SI_HA data should be
# excluded from fitting.
m <-sdmTMB(temp ~ area*year_f + source_f +s(yday, by = area, bs ="cc"), data = d,family =student(df =5),spatial ="off",spatiotemporal ="off",knots =list(yday =c(0.5, 364.5)),control =sdmTMBcontrol(newton_loops =1))
# Check convergence diagnostics for the fitted model (gradients, Hessian,
# standard errors, sigma parameters).
sanity(m)

# BUG FIX: the original QQ plot referenced d$mcmc_res, a column that is never
# created anywhere in this document, so the plot could not run. Compute
# randomized-quantile residuals from the fitted model first. If MCMC-based
# residuals are intended (as the name suggests), use
# residuals(m, type = "mle-mcmc", mcmc_samples = ...) instead — confirm.
d$mcmc_res <- residuals(m)

# QQ plot of residuals against the theoretical distribution; a straight line
# indicates the Student-t family fits the tails adequately.
ggplot(d, aes(sample = mcmc_res)) +
  stat_qq() +
  stat_qq_line() +
  labs(y = "Sample Quantiles", x = "Theoretical Quantiles") +
  theme(aspect.ratio = 1)
Code
# Save the residual QQ plot to the supplementary figures folder (11 x 11 cm).
qq_path <- paste0(home, "/figures/supp/qq_temp.pdf")
ggsave(qq_path, width = 11, height = 11, units = "cm")
Predict
Code
# Prediction grid: every combination of day-of-year, area, and year observed
# in the data, with source fixed to "logger" so all predictions are on the
# logger scale.
nd <- data.frame(expand.grid(yday = seq(min(d$yday), max(d$yday), by = 1),
                             area = unique(d$area),
                             year = unique(d$year)))

nd <- nd |>
  mutate(source = "logger",
         id = paste(year, area, sep = "_"),
         source_f = as.factor(source),
         year_f = as.factor(year))

# Fixed-effect predictions from the fitted model
nd$pred <- predict(m, newdata = nd)$est
In order to have a lower (realistic) temperature in the years before nuclear warming — for which no local data exist to inform the model — we substitute predictions from nearby reference areas: FM stands in for BT, and SI_EK for SI_HA, prior to the onset of warming.
Code
# Replace predictions for BT (before 1980) and SI_HA (before 1972) with
# predictions from the nearby reference areas FM and SI_EK, relabelled, so
# the pre-warming period gets plausible (cooler) temperatures despite having
# no local data for the model to learn from.
# NOTE(review): the data were discarded with `year <= 1980` / `year <= 1972`
# (first chunk), but predictions are dropped here with the strict `< 1980` /
# `< 1972`, so the boundary years keep model predictions informed by no local
# data — confirm which boundary is intended.
# NOTE(review): the relabel `ifelse(area == "FM", "BT", "SI_HA")` maps every
# non-FM row to "SI_HA"; this is safe only because nd_sub contains exclusively
# FM and SI_EK rows at this point.
nd <- nd |>mutate(keep ="Y",keep =ifelse(area =="BT"& year <1980, "N", keep),keep =ifelse(area =="SI_HA"& year <1972, "N", keep)) |>filter(keep =="Y")#> mutate: new variable 'keep' (character) with 2 unique values and 0% NA#> filter: removed 26,352 rows (7%), 338,184 rows remainingnd_sub <- nd |>mutate(keep ="N",keep =ifelse(area =="FM"& year <1980, "Y", keep), # use FM instead of BTkeep =ifelse(area =="SI_EK"& year <1972, "Y", keep)) |># use SI_EK instead of SI_HAfilter(keep =="Y")#> mutate: changed 311,832 values (92%) of 'keep' (0 new NA)#> filter: removed 311,832 rows (92%), 26,352 rows remaining# Now change the labels to BT and SI_EK...nd_sub <- nd_sub |>mutate(area =ifelse(area =="FM", "BT", "SI_HA"))#> mutate: converted 'area' from factor to character (0 new NA)# Bind rows and plot only the temperature series we will use for growth modellingnd <-bind_rows(nd, nd_sub) |>select(-keep)#> select: dropped one variable (keep)
Keep track of the years for which we have cohorts for matching with cohort data
Code
# Load the perch growth data to find, per area, the first cohort with usable
# growth data: keep only area/cohort/age groups with more than 10 fish and
# fish caught at age > 3, then take the minimum cohort year per area. Join
# that onto the prediction grid and flag years with growth data
# (growth_dat = "Y" when year >= min).
# NOTE(review): growth_dat uses `year >= min` here, while the unexecuted
# simulation chunk at the end of the document uses `year > min` — confirm the
# intended boundary and make the two consistent.
gdat <- readr::read_csv("https://raw.githubusercontent.com/maxlindmark/perch-growth/master/data/for-analysis/dat.csv")#> Rows: 364546 Columns: 11#> ── Column specification ────────────────────────────────────────────────────────#> Delimiter: ","#> chr (5): age_ring, area, gear, ID, sex#> dbl (6): length_mm, age_bc, age_catch, catch_year, cohort, final_length#> #> ℹ Use `spec()` to retrieve the full column specification for this data.#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.gdat$area_cohort_age <-as.factor(paste(gdat$area, gdat$cohort, gdat$age_bc))gdat <- gdat |>group_by(area_cohort_age) |>filter(n() >10)#> group_by: one grouping variable (area_cohort_age)#> filter (grouped): removed 5,488 rows (2%), 359,058 rows remaininggdat <- gdat |>filter(age_catch >3) |>group_by(area) |>summarise(min =min(cohort)) |>arrange(min)#> filter (grouped): removed 121,647 rows (34%), 237,411 rows remaining#> group_by: one grouping variable (area)#> summarise: now 12 rows and 2 columns, ungroupednd <-left_join(nd, gdat, by ="area") |>mutate(growth_dat =ifelse(year >= min, "Y", "N"))#> left_join: added one column (min)#> > rows only in x 0#> > rows only in y ( 0)#> > matched rows 364,536#> > =========#> > rows total 364,536#> mutate: new variable 'growth_dat' (character) with 2 unique values and 0% NA
Plot predictions
Code
# Heatmap of predicted temperature by day-of-year (x) and year (y), one panel
# per area, restricted to years with matching growth (cohort) data.
plot_dat <- filter(nd, growth_dat == "Y")

ggplot(plot_dat, aes(x = yday, y = year, fill = pred, color = pred)) +
  geom_raster() +
  facet_wrap(~area, ncol = 3) +
  scale_fill_viridis(option = "magma") +
  scale_color_viridis(option = "magma") +
  labs(x = "Yearday", y = "Year",
       color = "Predicted SST (°C)", fill = "Predicted SST (°C)")
Growing season? This might be different for different areas…
Code
# Growing season per area: the days of the year where the across-year mean
# predicted temperature exceeds 10 degrees C; gs_min/gs_max are the first and
# last such day.
gs_area <- nd |>
  group_by(area, yday) |>
  summarise(mean_pred = mean(pred)) |>
  ungroup() |>
  filter(mean_pred > 10) |>
  group_by(area) |>
  summarise(gs_min = min(yday),  # first day above 10 C
            gs_max = max(yday))  # last day above 10 C

nd <- left_join(nd, gs_area, by = "area")

# Dummy y-value so the gs_min/gs_max markers sit on the 10 C reference line
gs_area$mean_pred <- 10

# Plot the mean seasonal curve per area with growing-season endpoints marked.
# FIX: the y-axis label previously read "Year"; the axis shows the mean
# predicted temperature, so label it accordingly.
nd |>
  filter(growth_dat == "Y") |>
  group_by(area, yday) |>
  summarise(mean_pred = mean(pred)) |>
  ggplot(aes(yday, mean_pred)) +
  geom_line() +
  labs(x = "Yearday", y = "Mean predicted SST (°C)") +
  facet_wrap(~area) +
  geom_point(data = gs_area, aes(gs_min, mean_pred),
             inherit.aes = FALSE, color = "tomato2") +
  geom_point(data = gs_area, aes(gs_max, mean_pred),
             inherit.aes = FALSE, color = "tomato2") +
  geom_hline(yintercept = 10, linetype = 2)
Detailed exploration of predictions
Code
# For each area: plot model-predicted temperature against day of year,
# faceted by year, overlay the raw observations coloured by data source,
# and save one PDF per area.
# NOTE(review): the observation overlay filters with
# `year > min(plotdat$year)`, which excludes observed data from the first
# predicted year in each area — confirm whether `>=` was intended.
# Loop trough all areas, plot temperature as a function of yday, color by data source, facet by yearfor(i inunique(nd$area)) { plotdat <- nd |>filter(area == i)print(ggplot(plotdat, aes(yday, pred, linetype ="Model prediction (logger)")) +scale_color_brewer(palette ="Accent") +facet_wrap(~year) +geom_point(data =filter(d, area == i & year >min(plotdat$year)), aes(yday, temp, color = source),size =0.75) +geom_line() +labs(title =paste("Area = ", i), color ="", linetype ="") +guides(color =guide_legend(title.position="top", title.hjust =0.5)) +theme_sleek(base_size =8) +theme(legend.position =c(0.7, 0.03), legend.direction ="horizontal",legend.spacing.y =unit(-0.3, "cm")) +labs(x ="Day of the year", y ="Predicted SST (°C)") )ggsave(paste0(home, "/figures/supp/temp_pred_yday_area_", i, ".pdf" ), width =17, height =17, units ="cm")}#> filter: removed 334,158 rows (92%), 30,378 rows remaining#> filter: removed 95,167 rows (97%), 3,449 rows remaining#> filter: removed 334,158 rows (92%), 30,378 rows remaining#> filter: removed 89,164 rows (90%), 9,452 rows remaining
Now see if there is a systematic pattern in the difference between predicted and observed logger data, which could indicate that the source effect isn’t global but area-specific.
Code
# Compare predictions against logger observations to check whether the
# source effect should be area-specific rather than global:
# 1) average observed logger temperature per area_year_yday id (some ids
#    have more than one observation, hence the summarise),
# 2) keep only predictions with a matching observed id and join the
#    observations on,
# 3) plot manual residuals (pred - obs) per area as violins + boxplots.
# A systematic per-area offset from zero would indicate the global source
# effect is inadequate.
dlog <- d |>filter(source =="logger") |>mutate(type ="data",id =paste(area, year, yday, sep ="_")) |>select(id, temp) |>group_by(id) |>summarise(obs =mean(temp)) # sometimes we have more than 1 observation per id#> filter: removed 34,632 rows (35%), 63,984 rows remaining#> mutate: changed 63,984 values (100%) of 'id' (63984 fewer NA)#> new variable 'type' (character) with one unique value and 0% NA#> select: dropped 14 variables (year, area, yday, month, date, …)#> group_by: one grouping variable (id)#> summarise: now 60,967 rows and 2 columns, ungrouped# dlog |> # group_by(id) |> # summarise(n = n()) |> # distinct(n)preds_log <- nd |>filter(growth_dat =="Y"& source =="logger") |>mutate(type ="model",id =paste(area, year, yday, sep ="_")) |>filter(id %in%unique(dlog$id)) |>ungroup() |>left_join(dlog, by ="id")#> filter: removed 157,380 rows (43%), 207,156 rows remaining#> mutate: changed 207,156 values (100%) of 'id' (0 new NA)#> new variable 'type' (character) with one unique value and 0% NA#> filter: removed 146,189 rows (71%), 60,967 rows remaining#> ungroup: no grouping variables#> left_join: added one column (obs)#> > rows only in x 0#> > rows only in y ( 0)#> > matched rows 60,967#> > ========#> > rows total 60,967preds_log |>mutate(resid = pred - obs) |>ggplot(aes(as.factor(area), resid, group =as.factor(area))) +#geom_jitter(alpha = 0.05, color = "grey20", height = 0, width = 0.2) + geom_violin(fill ="grey70", color =NA) +geom_boxplot(width =0.2, outlier.colour =NA, outlier.color =NA, outlier.fill =NA) +guides(color ="none") +geom_hline(yintercept =0, linetype =2, color ="tomato3", linewidth =0.75) +labs(x ="Area", y ="Manual residuals")#> mutate: new variable 'resid' (double) with 60,967 unique values and 0% NA
Make final plot
Code
# Final figure: annual average predicted temperature per area over time.
# NOTE(review): `preds` is created in a chunk not visible here — presumably
# annual (or growing-season) means of the predictions; confirm upstream.
order <- preds |>
  group_by(area) |>
  summarise(mean_temp = mean(temp)) |>
  arrange(desc(mean_temp))
order

# Save plot order..
topt <- 10.6 # Overall t_opt from 02-fit-vbge.qmd! Update if needed

# Area coordinates, used to order facets from north to south
area <- c("BS", "BT", "FB", "FM", "HO", "JM", "MU", "RA", "SI_EK", "SI_HA", "TH", "VN")
nareas <- length(area)
lat <- c(60, 60.4, 60.3, 60.5, 63.7, 58, 59, 65.9, 57.3, 57.4, 56.1, 57.5)
lon <- c(21.5, 18.1, 19.5, 18, 20.9, 16.8, 18.1, 22.3, 16.6, 16.7, 15.9, 16.9)

# FIX: build the data frame directly instead of data.frame(cbind(...)).
# cbind() on mixed types coerced lat/lon to character, which then needed a
# mutate_at(..., as.numeric) round-trip; the direct constructor keeps the
# numeric columns numeric from the start.
area_attr <- data.frame(area = area, lat = lat, lon = lon) |>
  arrange(desc(lat))

ggplot(preds, aes(year, temp, color = temp)) +
  facet_wrap(~factor(area, levels = area_attr$area), ncol = 3) +
  geom_hline(yintercept = topt, linewidth = 0.3, linetype = 2, color = "grey") +
  geom_line() +
  labs(x = "Year", y = "Model-predicted annual average temperature") +
  scale_color_viridis(option = "magma", name = "Area") +
  guides(color = "none")
Code
# First and last predicted year per area, earliest series first
year_span <- preds |>
  group_by(area) |>
  summarise(min = min(year),
            max = max(year)) |>
  arrange(min)
year_span

# Save the annual-average figure from the previous chunk (17 x 17 cm)
ggsave(paste0(home, "/figures/annual_average_temperature.pdf"),
       width = 17, height = 17, units = "cm")
Code
# Persist the prediction data frame for downstream scripts
out_path <- paste0(home, "/output/gam_predicted_temps.csv")
write_csv(preds, out_path)
If we want to get uncertainty, we can use nsim instead; this simulates from the linear predictor using the inverse precision matrix, which is a fast way to get a distribution of samples from which we can take e.g. quantiles and means. However, it’s still slow, so the code below isn’t executed yet.
Code
# NOT YET EXECUTED (see text above): simulation-based uncertainty. Draws nsim
# samples from the fitted model's linear predictor (via the inverse precision
# matrix) and summarises them with quantiles per area and year.

# Same prediction grid as the point-prediction chunk
nd_sim <- data.frame(expand.grid(yday = seq(min(d$yday), max(d$yday), by = 1),
                                 area = unique(d$area),
                                 year = unique(d$year))) |>
  mutate(source = "logger") |>
  mutate(id = paste(year, area, sep = "_"),
         source_f = as.factor(source),
         year_f = as.factor(year))

# Trim to area/year combinations with growth data and to each area's
# growing season.
nd_sim <- left_join(nd_sim, gdat, by = "area")

# FIX: the original filter was `yday %in% c(gs_min:gs_min)`, which (a) kept
# only a single day instead of the growing season and (b) referenced gs_min
# before any join made it available on nd_sim; join gs_area and keep the
# full gs_min..gs_max window per area.
# FIX: use `year >= min` for consistency with the executed growth_dat
# definition earlier in the document (which uses >=, not >).
nd_sim <- nd_sim |>
  left_join(gs_area, by = "area") |>
  mutate(growth_dat = ifelse(year >= min, "Y", "N")) |>
  filter(growth_dat == "Y") |>
  filter(yday >= gs_min, yday <= gs_max) |>
  mutate(area = as.factor(area))

# Predict: nsim draws from the linear predictor
nsim <- 500
m_pred_sims <- predict(m, newdata = nd_sim, nsim = nsim)

# Long format: one row per (prediction row, simulation draw)
nd_sim_long <- cbind(nd_sim, as.data.frame(m_pred_sims)) |>
  pivot_longer(c((ncol(nd_sim) + 1):(nsim + ncol(nd_sim))))

# Summarise draws over the growing season: 10/50/90% quantiles per year-area
sum_pred_gs <- nd_sim_long |>
  ungroup() |>
  group_by(year, area) |>
  summarise(lwr = quantile(value, prob = 0.1),
            est = quantile(value, prob = 0.5),
            upr = quantile(value, prob = 0.9)) |>
  ungroup()

# In order to have a lower temperature in before-nuclear times (without any
# data to inform that), use the nearby areas (FM for BT, SI_EK for SI_HA).
# NOTE(review): the pipelines below start from `preds` and immediately
# overwrite the simulation summary computed above; they also reference `temp`
# and `type` columns that exist in `preds` but not in the quantile summary.
# This looks like draft code copied from the point-prediction chunk — confirm
# whether it should operate on `sum_pred_gs` (lwr/est/upr) instead.
sum_pred_gs <- preds |>
  mutate(keep = "Y",
         keep = ifelse(area == "BT" & year < 1980, "N", keep),
         keep = ifelse(area == "SI_HA" & year < 1972, "N", keep)) |>
  filter(keep == "Y")

sum_pred_gs_sub <- preds |>
  mutate(keep = "N",
         keep = ifelse(area == "FM" & year < 1980, "Y", keep),     # use FM instead of BT
         keep = ifelse(area == "SI_EK" & year < 1972, "Y", keep)) |> # use SI_EK instead of SI_HA
  filter(keep == "Y")

# Now change the labels to BT and SI_HA (safe only because nd_sub-style
# subsetting left exclusively FM and SI_EK rows here)
sum_pred_gs_sub <- sum_pred_gs_sub |>
  mutate(area = ifelse(area == "FM", "BT", "SI_HA"))

# Bind rows: the temperature series used for growth modelling
sum_pred_gs <- bind_rows(sum_pred_gs, sum_pred_gs_sub) |>
  select(-keep, -type)

order <- sum_pred_gs |>
  group_by(area) |>
  summarise(mean_temp = mean(temp)) |>
  arrange(desc(mean_temp))